/*Do-file for "Treating abandoned mine drainage can protect streams cost effectively and benefit vulnerable communities." Run on Stata/SE Version 18.0.
The dataset "KmProtectedData" is at the stream segment level, with each row corresponding to a stream segment, with 
variables that summarize discharges and costs of upstream systems.*/ 

clear
use KmProtectedData
	
*Calculating pH with and without treatment, assuming pH of 7 in receiving stream
*This combines the ions from the discharge with the ions of the receiving stream net of the discharge (=AvgFlow-flow)
gen ions0_liter=(ions0+((AvgFlow-flow)*(10^-7)))/AvgFlow
gen notreat_ph=-log10(ions0_liter)

gen ions1_liter=(ions1+((AvgFlow-flow)*(10^-7)))/AvgFlow
gen treat_ph=-log10(ions1_liter)

*Other measures with and without treatment, assuming zero concentration in the segment apart from discharge
foreach var in tfe tmn tal {
	qui: gen notreat_`var'=mg0_`var'/AvgFlow
	qui: gen treat_`var'=mg1_`var'/AvgFlow
}

*Impairment indicator with and without treatment
	gen impaired0=(notreat_ph <6 | notreat_tfe >1.5 | notreat_tmn>2 |notreat_tal>0.5)
	gen impaired1=(treat_ph <6 | treat_tfe >1.5 | treat_tmn>2 |treat_tal>0.5)
	
*********************************Main estimates***********************************************
	
*Method 1
	*Length of streams impaired without treatment but not impaired with treatment 
	*Creating variable that equals 1 if not impaired with treatment (impaired1==0) but is impaired without treatment (impaired0==1)
	gen iprotected1=(impaired0==1 & impaired1==0)
	*Summing the lengths of all segments protected from impairment because of treatment
	qui: sum Length if iprotected1==1
	scalar m1=r(sum)

	*Average flow of streams protected according to method 1
	mean AvgFlow if iprotected1==1

*Method 2 - km not impaired by AMD because they are not listed as such in the DEP report. The DEP impairment list is only for AMD impairment.  
	gen iprotected2=(mergeDEP==1 & impaired0==1)
	qui: sum Length if iprotected2==1
	scalar m2=r(sum)

*Key scalars: note that tc is a scalar that is the present value total cost of all systems considered in the analysis	
	scalar m=(m1+m2)/2
	scalar mc=(tc/m)
	scalar mc_yr=(tc/m)/25

	scalar cn=(m+8838)*mc

	matrix define r=m1\m2\m\tc\mc_yr\cn
	matrix rownames r= "Method 1" "Method 2" "Mean" "Total Cost" "Cost per km per year" "Total Need"

	matrix list r
	
	di "financial need:" (10381*5720*25)/10^9 /*This is "total km in need of treatment" * "cost per km per year" * "25 years", converted to billions of dollars*/ 
	
**************Bootstrapped 10 and 90 percent values for km protected and cost per km protected*************************

/*I randomly selected segments as opposed to systems. This is because randomly selecting systems and then merging
with segments would mean that some segments will have fewer upstream systems than they actually have and others
will have more upstream systems (and repeated upstream systems). This would be strange as it would create 
cases where a stream receives 3x or 1/3 the load it actually does. If selection is on segments, then 
conditional on the segment being selected, all the upstream systems area also selected, which makes sense,
but one segment could be in twice or not at all.*/ 

set seed 26721705 /*serial number from a random $20 bill*/

mata

/*get data*/

/*bring data into mata*/

	st_view(data=0,.,("iprotected1", "iprotected2","Length","cost"))
	n=rows(data)
	
	main_m=st_numscalar("m")			/*main results*/
	main_mc=st_numscalar("mc")
  
	B=1000						/*number of Bootstrap reps*/
	length_results=J(B,1,0)		/*matrix to store results from B reps*/
	cost_results=J(B,1,0)	
	
	for(i=1; i<=B;i++) {
	
	/*re-sampling data*/
		choose_rows=ceil(n*runiform(n,1))  	 /*randomly choosing n segments*/
		sampled=data[choose_rows,.]
		m1=sampled[.,1]:*sampled[.,3]		/*Grabbing length of protected streams through element by element multiplication, method 1*/
		m2=sampled[.,2]:*sampled[.,3]	    /*Grabbing length of protected streams, method 2*/
		m=(colsum(m1)+colsum(m2))/2			/*taking the average km of the two methods*/
		c=colsum(sampled[.,4])/m			/*calculating total cost / total km*/

	/*storing result, centered on the main estimate*/

		length_results[i]=(m-main_m)
		cost_results[i]=(c-main_mc)
		}
		
		length_results=sort(length_results,1) 
	    cost_results=sort(cost_results,1) 
		
		lb_length=main_m-length_results[950]
		ub_length=main_m-length_results[50]
		lb_cost=(main_mc-cost_results[950])/25	
		ub_cost=(main_mc-cost_results[50])/25

		results=(lb_length,ub_length)\(lb_cost, ub_cost)
		results
end		




